{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.sparse import csr_matrix, lil_matrix\n",
    "import scipy.sparse as sprs\n",
    "import sys\n",
    "import gc\n",
    "from tqdm.auto import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "def columns_zero_out(csr_massiv, columns): #DELETE PARTICULAR ROWS FROM CSR-MATRIX\n",
    "    \n",
    "    columns = columns.astype(int)\n",
    "    n = csr_massiv.shape[1]\n",
    "    \n",
    "    for j in tqdm(range(len(columns))):\n",
    "        data = np.ones(n-1)\n",
    "        indptr = [0]\n",
    "        indices = [0]\n",
    "        \n",
    "        for i in range(1, n):\n",
    "            indptr.append(indptr[i-1] + 1)\n",
    "            indices.append(indices[i-1] + 1)\n",
    "        \n",
    "        indptr.insert(columns[j], columns[j]) \n",
    "        del indices[columns[j]]\n",
    "        csr_massiv = csr_massiv.dot(csr_matrix((data, indices, indptr), shape=(n, n)))\n",
    "    \n",
    "    return csr_massiv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download raw data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "cl_1_1_dem = np.load('raw_data/cl_1_1_dem.npy')\n",
    "cl_1_1_sub = sprs.load_npz('raw_data/cl_1_1_sub.npz')\n",
    "\n",
    "cl_1_2_dem = np.load('raw_data/cl_1_2_dem.npy')\n",
    "cl_1_2_sub = sprs.load_npz('raw_data/cl_1_2_sub.npz')\n",
    "\n",
    "cl_2_dem = np.load('raw_data/cl_2_dem.npy')\n",
    "cl_2_sub = sprs.load_npz('raw_data/cl_2_sub.npz')\n",
    "\n",
    "cl_3_dem = np.load('raw_data/cl_3_dem.npy')\n",
    "cl_3_sub = sprs.load_npz('raw_data/cl_3_sub.npz')\n",
    "\n",
    "cl_4_1_dem = np.load('raw_data/cl_4_1_dem.npy')\n",
    "cl_4_1_sub = sprs.load_npz('raw_data/cl_4_1_sub.npz')\n",
    "\n",
    "cl_4_2_dem = np.load('raw_data/cl_4_2_dem.npy')\n",
    "cl_4_2_sub = sprs.load_npz('raw_data/cl_4_2_sub.npz')\n",
    "\n",
    "cl_4_3_dem = np.load('raw_data/cl_4_3_dem.npy')\n",
    "cl_4_3_sub = sprs.load_npz('raw_data/cl_4_3_sub.npz')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save matrices' sizes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "16027 76605 92144 93951 45506 46843 46202\n"
     ]
    }
   ],
   "source": [
    "cl_1_1_num = cl_1_1_dem.shape[0]\n",
    "\n",
    "cl_1_2_num = cl_1_2_dem.shape[0]\n",
    "\n",
    "cl_2_num = cl_2_dem.shape[0]\n",
    "\n",
    "cl_3_num = cl_3_dem.shape[0]\n",
    "\n",
    "cl_4_1_num = cl_4_1_dem.shape[0]\n",
    "cl_4_2_num = cl_4_2_dem.shape[0]\n",
    "cl_4_3_num = cl_4_3_dem.shape[0]\n",
    "\n",
    "print(cl_1_1_num, cl_1_2_num, \n",
    "      cl_2_num, \n",
    "      cl_3_num, \n",
    "      cl_4_1_num, cl_4_2_num, cl_4_3_num)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save the sizes of classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 92632  92144  93951 138551]\n"
     ]
    }
   ],
   "source": [
    "nums = np.array([cl_1_1_num + cl_1_2_num, \n",
    "                 cl_2_num, \n",
    "                 cl_3_num, \n",
    "                 cl_4_1_num + cl_4_2_num + cl_4_3_num])\n",
    "\n",
    "print(nums)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Stack sex, age and political preferences information into one matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(417278, 3)\n"
     ]
    }
   ],
   "source": [
    "dem = np.vstack((cl_1_1_dem, cl_1_2_dem, \n",
    "                 cl_2_dem, \n",
    "                 cl_3_dem, \n",
    "                 cl_4_1_dem, cl_4_2_dem, cl_4_3_dem))\n",
    "\n",
    "print(dem.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "162"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "del(cl_1_1_dem, cl_1_2_dem, cl_2_dem, cl_3_dem, cl_4_1_dem, cl_4_2_dem, cl_4_3_dem)\n",
    "\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(417278, 3)\n"
     ]
    }
   ],
   "source": [
    "dem = pd.DataFrame(dem)\n",
    "dem[[0, 1, 2]] = dem[[0, 1, 2]].apply(pd.to_numeric)\n",
    "\n",
    "dem = dem.values\n",
    "\n",
    "print(dem.shape)\n",
    "\n",
    "#dem.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Stack information on users' subscriptions into one matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "367\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "417278"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dim = max(cl_1_1_sub.shape[1], cl_1_2_sub.shape[1], \n",
    "          cl_2_sub.shape[1], \n",
    "          cl_3_sub.shape[1], \n",
    "          cl_4_1_sub.shape[1], cl_4_2_sub.shape[1], cl_4_3_sub.shape[1])\n",
    "\n",
    "print(dim)\n",
    "\n",
    "cl_1_1_sub.resize(cl_1_1_sub.shape[0], dim)\n",
    "cl_1_2_sub.resize(cl_1_2_sub.shape[0], dim)\n",
    "\n",
    "cl_2_sub.resize(cl_2_sub.shape[0], dim)\n",
    "\n",
    "cl_3_sub.resize(cl_3_sub.shape[0], dim)\n",
    "\n",
    "cl_4_1_sub.resize(cl_4_1_sub.shape[0], dim)\n",
    "cl_4_2_sub.resize(cl_4_2_sub.shape[0], dim)\n",
    "cl_4_3_sub.resize(cl_4_3_sub.shape[0], dim)\n",
    "\n",
    "sub = sprs.vstack((cl_1_1_sub, cl_1_2_sub, \n",
    "                   cl_2_sub, \n",
    "                   cl_3_sub, \n",
    "                   cl_4_1_sub, cl_4_2_sub, cl_4_3_sub))\n",
    "\n",
    "del(cl_1_1_sub, cl_1_2_sub, \n",
    "    cl_2_sub, \n",
    "    cl_3_sub, \n",
    "    cl_4_1_sub, cl_4_2_sub, cl_4_3_sub)\n",
    "\n",
    "gc.collect()\n",
    "\n",
    "users_num = sub.shape[0]\n",
    "users_num"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(417278, 367)\n"
     ]
    }
   ],
   "source": [
    "print(sub.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#The second parameter in the past row (367) correspond to the maximal number of subscriptions and it is greater \n",
    "#than 200 despite the fact that we downloaded only first 200 subscriptions from each user. Such results are artifacts \n",
    "#of VK API imperfection. Hereafter it will be shown, that there is only 49 users with more than 200 subcriptions. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Form a feature space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The number of unique public and bloger pages: (2109387,)\n",
      "Political preferences: [-1  1  2  3  4  5  6  7  8  9]\n",
      "Sex: [0 1 2]\n",
      "Ages frames: [0, 18, 23, 26, 36, 45, 51, 66, 86]\n"
     ]
    }
   ],
   "source": [
    "#THE LIST OF UNIQUE SUBSCRIPTIONS\n",
    "subscriptions = np.unique(sub.data)\n",
    "index = dict((subscriptions[i], i) for i in range(len(subscriptions)))\n",
    "np.save('encoded_data/subscriptions.npy', subscriptions)\n",
    "print('The number of unique public and bloger pages:', subscriptions.shape)\n",
    "\n",
    "#THE LIST OF UNIQUE POLITICAL PREFERENCES\n",
    "political = np.unique(dem[:, 2])\n",
    "political = political[political < 10]\n",
    "\n",
    "np.save('encoded_data/political.npy', political)\n",
    "print('Political preferences:', political)\n",
    "\n",
    "#THE LIST OF GENDERS\n",
    "genders = np.unique(dem[:, 0]) # list of genders\n",
    "print('Sex:', genders)\n",
    "\n",
    "#THE LIST OF AGES' FRAMES\n",
    "Ages=[0, 18, 23, 26, 36, 45, 51, 66, 86]\n",
    "print('Ages frames:', Ages)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Encode subscriptions by means of one-hot encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d0895f26850243fcbf2e60abe8d9ab10",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=417278), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "X_sub = lil_matrix((users_num, len(subscriptions)), dtype=np.float16)\n",
    "for i in tqdm(range(users_num)):\n",
    "    j=0\n",
    "    sub_temp = sub[i, j]\n",
    "    while sub_temp != 0:\n",
    "        X_sub[i, index[sub_temp]] = 1\n",
    "        j = j + 1\n",
    "        if j < dim:\n",
    "            sub_temp = sub[i, j]\n",
    "        else:\n",
    "            sub_temp = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_sub = X_sub.tocsr()\n",
    "sprs.save_npz('encoded_data/X_sub_full.npz', X_sub)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Users who have more than 200 subscription"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "sums = X_sub.sum(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[208., 203., 201., 235., 202., 364., 205., 207., 205., 210.,\n",
       "         231., 202., 203., 211., 201., 211., 203., 205., 218., 285.,\n",
       "         223., 292., 204., 203., 228., 251., 228., 224., 250., 211.,\n",
       "         219., 225., 201., 201., 202., 206., 212., 201., 229., 238.,\n",
       "         201., 204., 202., 206., 300., 203., 230., 211., 202.]],\n",
       "       dtype=float16)"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sums[sums>200]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1, 49)"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sums[sums>200].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1x367 sparse matrix of type '<class 'numpy.int64'>'\n",
       "\twith 364 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c = sums == sums.max()\n",
    "c = np.array(c)\n",
    "c = c.ravel()\n",
    "sub[c]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#There is only one user who has the maximal number of subscriptions among the users of the dataset. However, this number is\n",
    "#less than 367 and is equal to 364. Why is this? To find the answer let us analyze users' subscriptions data (before encoding) \n",
    "#in more details."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_copy = sub.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "367\n"
     ]
    }
   ],
   "source": [
    "sub_copy[sub_copy != 0] = 1\n",
    "sums = sub_copy.sum(axis=1)\n",
    "sums = np.array(sums)\n",
    "sums = sums.ravel()\n",
    "print(sums.max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "strange_list_of_sub = sub[sums == sums.max()].todense()\n",
    "strange_list_of_sub = np.array(strange_list_of_sub)\n",
    "strange_list_of_sub = strange_list_of_sub.ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "292"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(strange_list_of_sub))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#That is the answer! The user who had 367 subscription before encoding had repeated onces. Such duplications were dissapiared \n",
    "#through the encoding procedure  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Encode demography data by means of one-hot encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9e29a99b0027473697c3ac3e7d554238",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=417278), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "X_age = lil_matrix((users_num, len(Ages)+1), dtype=np.float16)\n",
    "X_sex = lil_matrix((users_num, genders.shape[0]), dtype=np.float16)\n",
    "X_political = lil_matrix((users_num, preferences.shape[0]), dtype=np.float16)\n",
    "for i in tqdm(range(users_num)):\n",
    "    \n",
    "    age = dem[i, 1]\n",
    "    sex = dem[i, 0]\n",
    "    political = dem[i, 2]\n",
    "\n",
    "    for a in range(len(Ages)): \n",
    "        if age < Ages[a]:\n",
    "            X_age[i, a] = 1\n",
    "            break\n",
    "        elif age >= Ages[len(Ages)-1]:\n",
    "            X_age[i, len(Ages)] = 1\n",
    "            break\n",
    "    \n",
    "    for j in range(genders.shape[0]): \n",
    "        if sex == genders[j]:\n",
    "            X_sex[i, j] = 1\n",
    "    \n",
    "    for j in range(political.shape[0]): \n",
    "        if political == political[j]:\n",
    "            X_political[i, j] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_age = X_age.tocsr()\n",
    "sprs.save_npz('encoded_data/X_age.npz', X_age)\n",
    "X_sex = X_sex.tocsr()\n",
    "sprs.save_npz('encoded_data/X_sex.npz', X_sex)\n",
    "X_pref = X_pref.tocsr()\n",
    "sprs.save_npz('encoded_data/X_political.npz', X_political)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Labels of classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = np.zeros(users_num)\n",
    "n=0\n",
    "for i in range(len(nums)):\n",
    "    for j in range(nums[i]):\n",
    "        labels[n] = i + 1\n",
    "        n += 1\n",
    "        \n",
    "np.save('encoded_data/labels.npy', labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Find indices of principal accounts in the publics/blogers-feature space array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#download the array of publics' and blogers' ids that form the feature space\n",
    "subscriptions = np.load('encoded_data/subscriptions.npy')  \n",
    "#MANUALLY CREATED SET OF PRINCIPAL ACCOUNTS\n",
    "principal_accounts_ids = [-72009603, -75625515, -54012242, 38940203, 41362423, 140204185, 129244038]\n",
    "columns_id = np.arange(len(subscriptions))[np.isin(subscriptions, principal_accounts_ids)]\n",
    "np.save('encoded_data/marker_accounts_id.npy', columns_id)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a new piece of data with eliminated subscriptions on marker accounts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_sub_full = sprs.load_npz('encoded_data/X_sub_full.npz')\n",
    "columns_id = np.load('encoded_data/marker_accounts_id.npy')\n",
    "X_sub_trunc = columns_zero_out(X_sub_full, columns_id)\n",
    "sprs.save_npz('encoded_data/X_sub_trunc.npz', X_sub_trunc)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
